<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <!--
      Document head for the "FileSet Exploration" page: encoding and build
      metadata, theme stylesheets, documentation scripts, and the relational
      links (index / search / top / up / next / prev).
    -->

    <!-- Single encoding declaration, kept first so it sits within the first 1024 bytes. -->
    <meta charset="utf-8">
    <meta name="author" content="Cask Data, Inc.">
    <meta name="copyright" content="Copyright © 2015-2017 Cask Data, Inc.">

    <!-- Build provenance of this documentation set. -->
    <meta name="git_release" content="6.1.1">
    <meta name="git_hash" content="05fbac36f9f7aadeb44f5728cea35136dbc243e5">
    <meta name="git_timestamp" content="2020-02-09 08:22:47 +0800">
    <title>FileSet Exploration</title>

    <!--
      Stylesheets. cdap-bootstrap.css is linked once, after the Bootstrap and
      plugin sheets, so its rules keep the final say in the cascade.
    -->
    <link rel="stylesheet" href="../_static/pygments.css">
    <link rel="stylesheet" href="../_static/bootstrap-3.3.6/css/bootstrap.min.css">
    <link rel="stylesheet" href="../_static/bootstrap-3.3.6/css/bootstrap-theme.min.css">
    <link rel="stylesheet" href="../_static/css/bootstrap-sphinx.css">
    <link rel="stylesheet" href="../_static/css/cdap-dynamicscrollspy-4.css">
    <link rel="stylesheet" href="../_static/css/jquery.mCustomScrollbar.css">
    <link rel="stylesheet" href="../_static/css/cdap-jquery.mCustomScrollbar.css">
    <link rel="stylesheet" href="../_static/css/abixTreeList-2.css">
    <link rel="stylesheet" href="../_static/cdap-bootstrap.css">

    <!-- Page-level options object; kept ahead of the script includes below, as in the original ordering. -->
    <script>
      var DOCUMENTATION_OPTIONS = {
        URL_ROOT:    '',
        VERSION:     '6.1.1',
        COLLAPSE_INDEX: false,
        FILE_SUFFIX: '.html',
        HAS_SOURCE:  false
      };
    </script>
    <!-- Loaded synchronously and in this order; later scripts may rely on the earlier ones. -->
    <script src="../_static/jquery.js"></script>
    <script src="../_static/underscore.js"></script>
    <script src="../_static/doctools.js"></script>
    <script src="../_static/language_data.js"></script>

    <!-- Favicon and document relationships. -->
    <link rel="shortcut icon" href="../_static/favicon.ico">
    <link rel="index" title="Index" href="../genindex.html">
    <link rel="search" title="Search" href="../search.html">
    <link rel="top" title="Cask Data Application Platform 6.1.1 Documentation" href="../index.html">
    <link rel="up" title="Data Exploration" href="index.html">
    <link rel="next" title="Table Exploration" href="tables.html">
    <link rel="prev" title="Data Exploration" href="index.html">
    <!-- block extrahead -->
    <!-- The encoding is declared once at the top of the head, so it is not repeated here. -->
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <!-- No maximum-scale: users must remain able to zoom the page. -->
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="apple-mobile-web-app-capable" content="yes">
    <!-- block extrahead end -->

  </head>
<body role="document">

<!-- block navbar -->
<div id="navbar" class="navbar navbar-inverse navbar-default navbar-fixed-top">
    <!-- Fixed top bar: brand logo, collapse toggle, version switcher, site search and section tabs. -->
    <div class="container-fluid">
      <div class="row">
        <div class="navbar-header">
          <a class="navbar-brand" href="../table-of-contents/../../index.html">
            <span><img src="../_static/cdap_logo.svg" alt="CDAP logo"></span>
          </a>

          <!-- Toggle for the collapsed navbar content (targets .nav-collapse); icon-only, so it carries an explicit accessible name. -->
          <button type="button" class="navbar-toggle" data-toggle="collapse" data-target=".nav-collapse" aria-label="Toggle navigation">
            <span class="icon-bar"></span>
            <span class="icon-bar"></span>
            <span class="icon-bar"></span>
          </button>

          <!-- Version switcher: current release plus links to older documentation sets. -->
          <div class="pull-right">
            <div class="dropdown version-dropdown">
              <a href="#" class="dropdown-toggle" data-toggle="dropdown"
                role="button" aria-haspopup="true" aria-expanded="false">
                v 6.1.1 <span class="caret"></span>
              </a>
              <ul class="dropdown-menu">
                <li><a href="//docs.cdap.io/cdap/5.1.2/en/index.html">v 5.1.2</a></li>
                <li><a href="//docs.cdap.io/cdap/4.3.4/en/index.html">v 4.3.4</a></li>
              </ul>
            </div>
          </div>
          <!-- Site search: submits the query plus two fixed hidden fields to search.html via GET. -->
          <form class="navbar-form navbar-right navbar-search" action="../search.html" method="get">
            <div class="form-group">
              <!-- Decorative icon only; hidden from assistive technology. -->
              <div class="navbar-search-image material-icons" aria-hidden="true"></div>
              <!-- The placeholder is not a label, so the field is named explicitly. -->
              <input type="text" name="q" class="form-control" placeholder="  Search" aria-label="Search">
            </div>
            <input type="hidden" name="check_keywords" value="yes">
            <input type="hidden" name="area" value="default">
          </form>

          <!-- Section tabs; this is the region the toggle button above collapses and expands. -->
          <div class="collapse navbar-collapse nav-collapse navbar-right navbar-navigation">
            <ul class="nav navbar-nav"><li class="docsite-nav-tab-container"><a class="docsite-nav-tab-link " href="../table-of-contents/../../index.html">简介</a></li><li class="docsite-nav-tab-container"><a class="docsite-nav-tab-link current" href="../table-of-contents/../../guides.html">手册</a></li><li class="docsite-nav-tab-container"><a class="docsite-nav-tab-link " href="../table-of-contents/../../reference-manual/index.html">参考</a></li><li class="docsite-nav-tab-container"><a class="docsite-nav-tab-link " href="../table-of-contents/../../faqs/index.html">帮助</a></li>
            </ul>
          </div>

        </div>
      </div>
    </div>
  </div><!-- block navbar end -->
<!-- block main content -->
<div class="main-container container">
  <div class="row"><div class="col-md-2">
      <div id="sidebar" class="bs-sidenav scrollable-y-outside" role="complementary">
<!-- theme_manual: developer-manual -->
<!-- theme_manual_highlight: guides -->
<!-- sidebar_title_link: ../table-of-contents/../../guides.html -->

  <div role="note" aria-label="manuals links"><h3><a href="../table-of-contents/../../guides.html">Guides</a></h3>

    <ul class="this-page-menu">
      <li class="toctree-l1"><a href="../table-of-contents/../../user-guide/index.html" rel="nofollow">用户手册</a>
      </li>
      <li class="toctree-l1"><b><a href="../table-of-contents/../../developer-manual/index.html" rel="nofollow">开发手册</a></b>
      <nav class="pagenav">
      <ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../index.html"> 简介</a></li>
<li class="toctree-l1"><a class="reference internal" href="../getting-started/index.html"> 入门指南</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../getting-started/sandbox/index.html">CDAP Sandbox</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../getting-started/sandbox/zip.html">二进制 Zip 文件</a></li>
<li class="toctree-l3"><a class="reference internal" href="../getting-started/sandbox/zip.html#cdap-sandbox">启动和停止 CDAP Sandbox</a></li>
<li class="toctree-l3"><a class="reference internal" href="../getting-started/sandbox/virtual-machine.html">虚拟机镜像</a></li>
<li class="toctree-l3"><a class="reference internal" href="../getting-started/sandbox/docker.html">Docker 镜像</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../getting-started/quick-start.html">快速入门</a></li>
<li class="toctree-l2"><a class="reference internal" href="../getting-started/dev-env.html">搭建开发环境</a></li>
<li class="toctree-l2"><a class="reference internal" href="../getting-started/start-stop-cdap.html">启动和停止 CDAP</a></li>
<li class="toctree-l2"><a class="reference internal" href="../getting-started/building-apps.html">构建并运行应用</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../overview/index.html"> 概述</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../overview/anatomy.html"> 大数据应用剖析</a></li>
<li class="toctree-l2"><a class="reference internal" href="../overview/modes.html"> 模式和组件</a></li>
<li class="toctree-l2"><a class="reference internal" href="../overview/abstractions.html"> 核心概念</a></li>
<li class="toctree-l2"><a class="reference internal" href="../overview/interfaces.html"> 编程接口</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../building-blocks/index.html"> 抽象概念</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/core.html"> Core Abstractions</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/applications.html"> Applications</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/datasets/index.html"> Datasets</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../building-blocks/datasets/overview.html"> Overview</a></li>
<li class="toctree-l3"><a class="reference internal" href="../building-blocks/datasets/table.html"> Table API</a></li>
<li class="toctree-l3"><a class="reference internal" href="../building-blocks/datasets/fileset.html"> FileSets</a></li>
<li class="toctree-l3"><a class="reference internal" href="../building-blocks/datasets/partitioned-fileset.html"> Partitioned FileSets</a></li>
<li class="toctree-l3"><a class="reference internal" href="../building-blocks/datasets/time-partitioned-fileset.html"> TimePartitioned FileSets</a></li>
<li class="toctree-l3"><a class="reference internal" href="../building-blocks/datasets/system-custom.html"> System and Custom Datasets</a></li>
<li class="toctree-l3"><a class="reference internal" href="../building-blocks/datasets/permissions.html"> Dataset Permissions</a></li>
<li class="toctree-l3"><a class="reference internal" href="../building-blocks/datasets/cube.html"> Cube Dataset</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/mapreduce-programs.html"> MapReduce Programs</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/plugins.html"> Plugins</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/schedules.html"> Schedules</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/secure-keys.html"> Secure Keys</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/services.html"> Services</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/spark-programs.html"> Spark Programs</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/workers.html"> Workers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/workflows.html"> Workflows</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/artifacts.html"> Artifacts</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/program-lifecycle.html"> Program Lifecycle</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/namespaces.html"> Namespaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/transaction-system.html"> Transaction System</a></li>
<li class="toctree-l2"><a class="reference internal" href="../building-blocks/transactional-messaging-system.html"> Transactional Messaging System</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../metadata/index.html"> 元数据</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../metadata/system-metadata.html"> System Metadata</a></li>
<li class="toctree-l2"><a class="reference internal" href="../metadata/discovery-lineage.html"> Discovery and Lineage</a></li>
<li class="toctree-l2"><a class="reference internal" href="../metadata/field-lineage.html"> Field Level Lineage</a></li>
<li class="toctree-l2"><a class="reference internal" href="../metadata/audit-logging.html"> Audit Logging</a></li>
<li class="toctree-l2"><a class="reference internal" href="../metadata/metadata-ui.html"> CDAP Metadata UI</a></li>
<li class="toctree-l2"><a class="reference internal" href="../metadata/programmatic-metadata.html"> Accessing metadata programmatically</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../pipelines/index.html"> 数据流管道</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/concepts-design.html"> Concepts and Design</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/getting-started.html"> Getting Started</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/studio.html"> CDAP Studio</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/creating-pipelines.html"> Creating Pipelines</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/running-pipelines.html"> Running Pipelines</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/plugin-management.html"> Plugin Management</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/plugins/index.html"> Plugin Reference</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/actions/index.html"> Action Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/sources/index.html"> Source Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/transforms/index.html"> Transform Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/analytics/index.html"> Analytic Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/sinks/index.html"> Sink Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/shared-plugins/index.html"> Shared Plugins</a><ul>
<li class="toctree-l4"><a class="reference internal" href="../pipelines/plugins/shared-plugins/core.html">CoreValidator</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/post-run-plugins/index.html"> Post-run Plugins</a><ul class="simple">
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/developing-pipelines.html"> Developing Pipelines</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/developing-plugins/index.html"> Developing Plugins</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/developing-plugins/plugin-basics.html">Plugin Basics</a></li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/developing-plugins/creating-a-plugin.html">Creating a Plugin</a></li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/developing-plugins/presentation-plugins.html">Plugin Presentation</a></li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/developing-plugins/testing-plugins.html">Testing Plugins</a></li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/developing-plugins/packaging-plugins.html">Packaging Plugins</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/how-cdap-pipelines-work.html"> How CDAP Pipelines Work</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../cloud-runtimes/index.html"> 云平台运行</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../cloud-runtimes/concepts/index.html"> Concepts</a></li>
<li class="toctree-l2"><a class="reference internal" href="../cloud-runtimes/provisioners/index.html"> Provisioners</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../cloud-runtimes/provisioners/gcp-dataproc.html">Google Dataproc</a></li>
<li class="toctree-l3"><a class="reference internal" href="../cloud-runtimes/provisioners/aws-emr.html">Amazon Elastic MapReduce</a></li>
<li class="toctree-l3"><a class="reference internal" href="../cloud-runtimes/provisioners/remote-hadoop.html">Remote Hadoop</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../cloud-runtimes/profiles/index.html"> Profiles</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../cloud-runtimes/profiles/creating-profiles.html">Creating Profiles</a></li>
<li class="toctree-l3"><a class="reference internal" href="../cloud-runtimes/profiles/assigning-profiles.html">Assigning Profiles</a></li>
<li class="toctree-l3"><a class="reference internal" href="../cloud-runtimes/profiles/admin-controls.html">Admin Controls</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../cloud-runtimes/example/index.html"> Example</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../security/index.html"> 安全</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../security/client-authentication.html">Client Authentication</a></li>
<li class="toctree-l2"><a class="reference internal" href="../security/cdap-authentication-clients-java.html">CDAP Authentication Client for Java</a></li>
<li class="toctree-l2"><a class="reference internal" href="../security/cdap-authentication-clients-python.html">CDAP Authentication Client for Python</a></li>
<li class="toctree-l2"><a class="reference internal" href="../security/custom-authentication.html">Custom Authentication</a></li>
<li class="toctree-l2"><a class="reference internal" href="../security/authorization-extensions.html">Authorization Extensions</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../testing/index.html"> 测试和调试</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../testing/testing.html"> Testing a CDAP Application</a></li>
<li class="toctree-l2"><a class="reference internal" href="../testing/debugging.html"> Debugging</a></li>
<li class="toctree-l2"><a class="reference internal" href="../testing/troubleshooting.html"> Troubleshooting</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../ingesting-tools/index.html"> 数据融合</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../ingesting-tools/cdap-stream-clients-java.html">CDAP Stream Client for Java</a></li>
<li class="toctree-l2"><a class="reference internal" href="../ingesting-tools/cdap-stream-clients-python.html">CDAP Stream Client for Python</a></li>
<li class="toctree-l2"><a class="reference internal" href="../ingesting-tools/cdap-stream-clients-ruby.html">CDAP Stream Client for Ruby</a></li>
<li class="toctree-l2"><a class="reference internal" href="../ingesting-tools/cdap-flume.html">CDAP Flume</a></li>
</ul>
</li>
<li class="toctree-l1 current"><a class="reference internal" href="index.html"> 数据探索</a><ul class="current">
<li class="toctree-l2 current"><a class="current reference internal" href="#"> FileSet Exploration</a></li>
<li class="toctree-l2"><a class="reference internal" href="tables.html"> Table Exploration</a></li>
<li class="toctree-l2"><a class="reference internal" href="object-mapped-tables.html"> ObjectMappedTable Exploration</a></li>
<li class="toctree-l2"><a class="reference internal" href="custom-datasets.html"> Custom Dataset Exploration</a></li>
<li class="toctree-l2"><a class="reference internal" href="hive-execution-engines.html"> Hive Execution Engines</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/index.html"> 高级主题</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../advanced/application-logback.html"> Application Logback</a></li>
<li class="toctree-l2"><a class="reference internal" href="../advanced/best-practices.html"> Best Practices</a></li>
<li class="toctree-l2"><a class="reference internal" href="../advanced/class-loading.html"> Class Loading</a></li>
<li class="toctree-l2"><a class="reference internal" href="../advanced/configuring-resources.html"> Configuring Program Resources</a></li>
<li class="toctree-l2"><a class="reference internal" href="../advanced/program-retry-policies.html"> Program Retry Policies</a></li>
</ul>
</li>
</ul>
</nav>
      </li>
      <li class="toctree-l1"><a href="../table-of-contents/../../admin-manual/index.html" rel="nofollow">管理手册</a>
      </li>
      <li class="toctree-l1"><a href="../table-of-contents/../../integrations/index.html" rel="nofollow">集成手册</a>
      </li>
      <li class="toctree-l1"><a href="../table-of-contents/../../examples-manual/index.html" rel="nofollow">最佳实践</a>
      </li>
    </ul>
  </div></div>
    </div><div class="col-md-8 content" id="main-content">
    
  <div class="section" id="fileset-exploration">
<span id="id1"></span><h1>FileSet Exploration<a class="headerlink" href="#fileset-exploration" title="Permalink to this headline">🔗</a></h1>
<p>The <code class="docutils literal notranslate"><span class="pre">FileSet</span></code>, <code class="docutils literal notranslate"><span class="pre">PartitionedFileSet</span></code>, and <code class="docutils literal notranslate"><span class="pre">TimePartitionedFileSet</span></code> datasets can be
explored through ad-hoc SQL-like queries. To enable exploration, you must set several
properties when creating the dataset, and the files in your dataset must meet certain
requirements. These properties and requirements are described below.</p>
<div class="section" id="explore-properties">
<h2>Explore Properties<a class="headerlink" href="#explore-properties" title="Permalink to this headline">🔗</a></h2>
<p>A <code class="docutils literal notranslate"><span class="pre">FileSet</span></code>, <code class="docutils literal notranslate"><span class="pre">PartitionedFileSet</span></code>, or <code class="docutils literal notranslate"><span class="pre">TimePartitionedFileSet</span></code> is made explorable by setting several properties when
creating the dataset. The <code class="docutils literal notranslate"><span class="pre">FileSetProperties</span></code> class (<code class="docutils literal notranslate"><span class="pre">PartitionedFileSetProperties</span></code> or <code class="docutils literal notranslate"><span class="pre">TimePartitionedFileSetsProperties</span></code>
classes for the other two types) should be used to set the following required properties:</p>
<ul class="simple">
<li><code class="docutils literal notranslate"><span class="pre">EnableExploreOnCreate</span></code> must be set to true to create a Hive table when the dataset is created</li>
<li><code class="docutils literal notranslate"><span class="pre">SerDe</span></code> class that Hive should use for serialization and deserialization</li>
<li><code class="docutils literal notranslate"><span class="pre">InputFormat</span></code> that Hive should use for reading files</li>
<li><code class="docutils literal notranslate"><span class="pre">OutputFormat</span></code> that Hive should use for writing files</li>
</ul>
<p>Any other table properties that the SerDe may need must also be set.
For example, in the configure method of your application:</p>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="n">Schema</span> <span class="n">schema</span> <span class="o">=</span> <span class="n">Schema</span><span class="p">.</span><span class="na">recordOf</span><span class="p">(</span>
  <span class="s">&quot;purchase&quot;</span><span class="p">,</span>
  <span class="n">Schema</span><span class="p">.</span><span class="na">Field</span><span class="p">.</span><span class="na">of</span><span class="p">(</span><span class="s">&quot;user&quot;</span><span class="p">,</span> <span class="n">Schema</span><span class="p">.</span><span class="na">of</span><span class="p">(</span><span class="n">Schema</span><span class="p">.</span><span class="na">Type</span><span class="p">.</span><span class="na">STRING</span><span class="p">)),</span>
  <span class="n">Schema</span><span class="p">.</span><span class="na">Field</span><span class="p">.</span><span class="na">of</span><span class="p">(</span><span class="s">&quot;item_id&quot;</span><span class="p">,</span> <span class="n">Schema</span><span class="p">.</span><span class="na">of</span><span class="p">(</span><span class="n">Schema</span><span class="p">.</span><span class="na">Type</span><span class="p">.</span><span class="na">INT</span><span class="p">)),</span>
  <span class="n">Schema</span><span class="p">.</span><span class="na">Field</span><span class="p">.</span><span class="na">of</span><span class="p">(</span><span class="s">&quot;price&quot;</span><span class="p">,</span> <span class="n">Schema</span><span class="p">.</span><span class="na">of</span><span class="p">(</span><span class="n">Schema</span><span class="p">.</span><span class="na">Type</span><span class="p">.</span><span class="na">DOUBLE</span><span class="p">))</span>
<span class="p">);</span>
<span class="n">createDataset</span><span class="p">(</span><span class="s">&quot;myfiles&quot;</span><span class="p">,</span> <span class="s">&quot;fileSet&quot;</span><span class="p">,</span> <span class="n">FileSetProperties</span><span class="p">.</span><span class="na">builder</span><span class="p">()</span>
  <span class="p">.</span><span class="na">setBasePath</span><span class="p">(</span><span class="s">&quot;mylocation&quot;</span><span class="p">)</span>
  <span class="p">.</span><span class="na">setInputFormat</span><span class="p">(</span><span class="n">AvroKeyInputFormat</span><span class="p">.</span><span class="na">class</span><span class="p">)</span>
  <span class="p">.</span><span class="na">setOutputFormat</span><span class="p">(</span><span class="n">AvroKeyOutputFormat</span><span class="p">.</span><span class="na">class</span><span class="p">)</span>
  <span class="c1">// everything past here is a CDAP Explore property</span>
  <span class="p">.</span><span class="na">setEnableExploreOnCreate</span><span class="p">(</span><span class="kc">true</span><span class="p">)</span>
  <span class="p">.</span><span class="na">setSerDe</span><span class="p">(</span><span class="s">&quot;org.apache.hadoop.hive.serde2.avro.AvroSerDe&quot;</span><span class="p">)</span>
  <span class="p">.</span><span class="na">setExploreInputFormat</span><span class="p">(</span><span class="s">&quot;org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat&quot;</span><span class="p">)</span>
  <span class="p">.</span><span class="na">setExploreOutputFormat</span><span class="p">(</span><span class="s">&quot;org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat&quot;</span><span class="p">)</span>
  <span class="p">.</span><span class="na">setTableProperty</span><span class="p">(</span><span class="s">&quot;avro.schema.literal&quot;</span><span class="p">,</span> <span class="n">schema</span><span class="p">.</span><span class="na">toString</span><span class="p">())</span>
  <span class="p">.</span><span class="na">build</span><span class="p">());</span>
</pre></div>
</div>
<p>For a <code class="docutils literal notranslate"><span class="pre">FileSet</span></code> using the <code class="docutils literal notranslate"><span class="pre">AvroParquet</span></code> format:</p>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="n">FileSetProperties</span><span class="p">.</span><span class="na">builder</span><span class="p">()</span>
<span class="p">.</span><span class="na">setBasePath</span><span class="p">(</span><span class="n">basePath</span><span class="p">)</span>
<span class="p">.</span><span class="na">setInputFormat</span><span class="p">(</span><span class="n">AvroParquetInputFormat</span><span class="p">.</span><span class="na">class</span><span class="p">)</span>
<span class="p">.</span><span class="na">setOutputFormat</span><span class="p">(</span><span class="n">AvroParquetOutputFormat</span><span class="p">.</span><span class="na">class</span><span class="p">)</span>
<span class="p">.</span><span class="na">setEnableExploreOnCreate</span><span class="p">(</span><span class="kc">true</span><span class="p">)</span>
<span class="p">.</span><span class="na">setExploreFormat</span><span class="p">(</span><span class="s">&quot;parquet&quot;</span><span class="p">)</span>
<span class="p">.</span><span class="na">setExploreSchema</span><span class="p">(</span><span class="s">&quot;id long, name string&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p>A <code class="docutils literal notranslate"><span class="pre">PartitionedFileSet</span></code> using the <code class="docutils literal notranslate"><span class="pre">text</span></code> format, with <code class="docutils literal notranslate"><span class="pre">\n</span></code> as the record delimiter:</p>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="n">PartitionedFileSetProperties</span><span class="p">.</span><span class="na">builder</span><span class="p">()</span>
<span class="c1">// Properties for partitioning</span>
<span class="p">.</span><span class="na">setPartitioning</span><span class="p">(</span><span class="n">Partitioning</span><span class="p">.</span><span class="na">builder</span><span class="p">().</span><span class="na">addLongField</span><span class="p">(</span><span class="s">&quot;time&quot;</span><span class="p">).</span><span class="na">build</span><span class="p">())</span>
<span class="c1">// Properties for file set</span>
<span class="p">.</span><span class="na">setInputFormat</span><span class="p">(</span><span class="n">TextInputFormat</span><span class="p">.</span><span class="na">class</span><span class="p">)</span>
<span class="p">.</span><span class="na">setOutputFormat</span><span class="p">(</span><span class="n">TextOutputFormat</span><span class="p">.</span><span class="na">class</span><span class="p">)</span>
<span class="p">.</span><span class="na">setOutputProperty</span><span class="p">(</span><span class="n">TextOutputFormat</span><span class="p">.</span><span class="na">SEPERATOR</span><span class="p">,</span> <span class="s">&quot;,&quot;</span><span class="p">)</span>
<span class="c1">// enable CDAP Explore</span>
<span class="p">.</span><span class="na">setEnableExploreOnCreate</span><span class="p">(</span><span class="kc">true</span><span class="p">)</span>
<span class="p">.</span><span class="na">setExploreFormat</span><span class="p">(</span><span class="s">&quot;text&quot;</span><span class="p">)</span>
<span class="p">.</span><span class="na">setExploreFormatProperty</span><span class="p">(</span><span class="s">&quot;delimiter&quot;</span><span class="p">,</span> <span class="s">&quot;\n&quot;</span><span class="p">)</span>
<span class="p">.</span><span class="na">setExploreSchema</span><span class="p">(</span><span class="s">&quot;record STRING&quot;</span><span class="p">)</span>
<span class="p">.</span><span class="na">build</span><span class="p">()</span>
</pre></div>
</div>
<p>If you are running a version of Hive that reserves keywords and any of your column names is a <a class="reference external" href="https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-Keywords,Non-reservedKeywordsandReservedKeywords">Hive reserved keyword</a>, you will need to enclose the column name in backticks.
For example:</p>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="n">PartitionedFileSetProperties</span><span class="p">.</span><span class="na">builder</span><span class="p">()</span>
<span class="c1">// Properties for partitioning</span>
<span class="p">.</span><span class="na">setPartitioning</span><span class="p">(</span><span class="n">Partitioning</span><span class="p">.</span><span class="na">builder</span><span class="p">().</span><span class="na">addLongField</span><span class="p">(</span><span class="s">&quot;time&quot;</span><span class="p">).</span><span class="na">build</span><span class="p">())</span>
<span class="c1">// Properties for file set</span>
<span class="p">.</span><span class="na">setInputFormat</span><span class="p">(</span><span class="n">TextInputFormat</span><span class="p">.</span><span class="na">class</span><span class="p">)</span>
<span class="p">.</span><span class="na">setOutputFormat</span><span class="p">(</span><span class="n">TextOutputFormat</span><span class="p">.</span><span class="na">class</span><span class="p">)</span>
<span class="p">.</span><span class="na">setOutputProperty</span><span class="p">(</span><span class="n">TextOutputFormat</span><span class="p">.</span><span class="na">SEPERATOR</span><span class="p">,</span> <span class="s">&quot;,&quot;</span><span class="p">)</span>
<span class="c1">// enable CDAP Explore</span>
<span class="p">.</span><span class="na">setEnableExploreOnCreate</span><span class="p">(</span><span class="kc">true</span><span class="p">)</span>
<span class="p">.</span><span class="na">setExploreFormat</span><span class="p">(</span><span class="s">&quot;text&quot;</span><span class="p">)</span>
<span class="p">.</span><span class="na">setExploreFormatProperty</span><span class="p">(</span><span class="s">&quot;delimiter&quot;</span><span class="p">,</span> <span class="s">&quot;\n&quot;</span><span class="p">)</span>
<span class="p">.</span><span class="na">setExploreSchema</span><span class="p">(</span><span class="s">&quot;`date` STRING&quot;</span><span class="p">)</span>
<span class="p">.</span><span class="na">build</span><span class="p">()</span>
</pre></div>
</div>
<p>These dataset properties map directly to table properties in Hive. In the case of the
<code class="docutils literal notranslate"><span class="pre">setBasePath</span></code> method, the partial-path given will be a sub-directory of
<code class="docutils literal notranslate"><span class="pre">&lt;CDAP-home&gt;/namespaces/&lt;namespace-id&gt;/data/</span></code>.</p>
<p>For example, if <code class="docutils literal notranslate"><span class="pre">&lt;CDAP-home&gt;</span></code> is <em>/cdap</em>, and <code class="docutils literal notranslate"><span class="pre">&lt;namespace-id&gt;</span></code> is <em>default</em>,
the first Dataset example above would result in this “create table” statement being generated:</p>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="n">CREATE</span> <span class="n">EXTERNAL</span> <span class="n">TABLE</span> <span class="nf">dataset_myfiles</span><span class="p">(</span>
  <span class="n">user</span> <span class="n">string</span><span class="p">,</span>
  <span class="n">item_id</span> <span class="kt">int</span><span class="p">,</span>
  <span class="n">price</span> <span class="kt">double</span><span class="p">)</span>
<span class="n">ROW</span> <span class="n">FORMAT</span> <span class="n">SERDE</span> <span class="s">&quot;org.apache.hadoop.hive.serde2.avro.AvroSerDe&quot;</span>
<span class="n">STORED</span> <span class="n">AS</span> <span class="n">INPUTFORMAT</span> <span class="s">&quot;org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat&quot;</span>
<span class="n">OUTPUTFORMAT</span> <span class="s">&quot;org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat&quot;</span>
<span class="n">LOCATION</span> <span class="s">&quot;/cdap/namespaces/default/data/mylocation&quot;</span>
<span class="n">TBLPROPERTIES</span> <span class="p">(</span>
  <span class="s">&quot;avro.schema.literal&quot;</span><span class="o">=</span><span class="s">&quot;{\&quot;type\&quot;: \&quot;record\&quot;, \&quot;name\&quot;: \&quot;stringBody\&quot;, \&quot;fields\&quot;: [{ \&quot;name\&quot;:\&quot;ts\&quot;, \&quot;type\&quot;:\&quot;long\&quot; }, { \&quot;name\&quot;:\&quot;body\&quot;, \&quot;type\&quot;:\&quot;string\&quot; } ] }&quot;</span>
<span class="p">);</span>
</pre></div>
</div>
<p>Please see the <a class="reference external" href="https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-Create/Drop/TruncateTable">Hive Language Manual</a>
for more information about input formats, output formats, SerDes, and table properties.</p>
<p>Note that the <code class="docutils literal notranslate"><span class="pre">setUseExisting()</span></code> and <code class="docutils literal notranslate"><span class="pre">setPossessExisting()</span></code> methods of FileSet properties—as described in <a class="reference internal" href="../building-blocks/datasets/fileset.html#datasets-fileset"><span class="std std-ref">FileSet Dataset</span></a>—also apply to the explore table. This is
especially important for PartitionedFileSets:</p>
<ul class="simple">
<li><code class="docutils literal notranslate"><span class="pre">setUseExisting(true)</span></code> has the effect that an existing Hive table can be used. But when the
dataset is dropped or truncated, or when explore is disabled for the dataset, the Hive table
will remain unaffected.</li>
<li><code class="docutils literal notranslate"><span class="pre">setPossessExisting(true)</span></code> directs the dataset to take possession of an existing Hive table.
That means that when the dataset is dropped or truncated, or when exploration is disabled for the dataset,
the Hive table will be dropped or cleared from all partitions.</li>
</ul>
</div>
<div class="section" id="limitations">
<h2>Limitations<a class="headerlink" href="#limitations" title="Permalink to this headline">🔗</a></h2>
<p>There are several limitations for fileset exploration:</p>
<ul class="simple">
<li>All explorable files must be in a format supported by your version of Hive.</li>
<li>Your version of Hive must include <a class="reference external" href="https://cwiki.apache.org/confluence/display/Hive/SerDe#SerDe-Built-inSerDes">the appropriate SerDe</a>.</li>
<li>Some versions of Hive may try to create a temporary staging directory at the table location when executing queries.
If you are seeing permissions errors, try setting <code class="docutils literal notranslate"><span class="pre">hive.exec.stagingdir</span></code> in your Hive configuration to <code class="docutils literal notranslate"><span class="pre">/tmp/hive-staging</span></code>.</li>
</ul>
<p>A <code class="docutils literal notranslate"><span class="pre">FileSet</span></code> has some additional limitations that the <code class="docutils literal notranslate"><span class="pre">PartitionedFileSet</span></code> and <code class="docutils literal notranslate"><span class="pre">TimePartitionedFileSet</span></code> do not have:</p>
<ul class="simple">
<li>Hive tables created by a <code class="docutils literal notranslate"><span class="pre">FileSet</span></code> are not partitioned; this means all queries perform a full table scan.</li>
<li>Only files at the base location of the <code class="docutils literal notranslate"><span class="pre">FileSet</span></code> are visible to queries. Directories are not read.
Since MapReduce writes output files to a directory, you must move all output files to the base location for
MapReduce output to be explorable.</li>
</ul>
<p>If you wish to use Impala to explore a <code class="docutils literal notranslate"><span class="pre">FileSet</span></code>, <code class="docutils literal notranslate"><span class="pre">PartitionedFileSet</span></code>, or <code class="docutils literal notranslate"><span class="pre">TimePartitionedFileSet</span></code>, there are several
additional restrictions you must keep in mind:</p>
<ul class="simple">
<li>Impala only supports scalar types. See <a class="reference external" href="http://www.cloudera.com/content/cloudera/en/documentation/cloudera-impala/latest/topics/impala_avro.html#avro_data_types_unique_1">Data Type Considerations for Avro Tables</a>
for details.</li>
<li>If your underlying data contains non-scalars, you cannot tell Impala to use a different read schema of just scalars.
For example, if you have Avro files that contain a map field, you cannot simply leave out the field when specifying the table schema.</li>
<li>Impala caches table metadata, which clients must invalidate when there are changes.
You have to issue an <code class="docutils literal notranslate"><span class="pre">INVALIDATE</span> <span class="pre">METADATA</span> <span class="pre">[tablename]</span></code> command whenever table metadata changes.
You’ll need to invalidate metadata if a new table or table partition is added. Otherwise, Impala will use the
cached table metadata, preventing you from seeing the changes. If data is added to a table without changing the
metadata (such as when adding a partition), then you need to issue a <code class="docutils literal notranslate"><span class="pre">REFRESH</span> <span class="pre">[tablename]</span></code> command to force
Impala to see the changes. Though Impala also caches info on table files and blocks, any calls to the
<code class="docutils literal notranslate"><span class="pre">REFRESH</span></code> command will cause it to re-read the information.</li>
</ul>
</div>
</div>

</div>
    <div class="col-md-2">
      <div id="right-sidebar" class="bs-sidenav scrollable-y" role="complementary">
        <div id="localtoc-scrollspy">
        </div>
      </div>
    </div></div>
</div>
<!-- block main content end -->
<!-- block footer -->
<footer class="footer">
      <div class="container">
        <div class="row">
          <div class="col-md-2 footer-left"><a title="Data Exploration" href="index.html">Previous</a></div>
          <div class="col-md-8 footer-center"><a class="footer-tab-link" href="../table-of-contents/../../reference-manual/licenses/index.html">Copyright</a> &copy; 2014-2020 Cask Data, Inc. &bull; <a class="footer-tab-link" href="//docs.cask.co/cdap/6.1.1/cdap-docs-6.1.1-web.zip" rel="nofollow">Download</a> an archive or
<a class="footer-tab-link" href="//docs.cask.co/cdap">switch the version</a> of the documentation
          </div>
          <div class="col-md-2 footer-right"><a title="Table Exploration" href="tables.html">Next</a></div>
        </div>
      </div>
    </footer>
<!-- block footer end -->
<script type="text/javascript" src="../_static/bootstrap-3.3.6/js/bootstrap.min.js"></script><script type="text/javascript" src="../_static/js/bootstrap-sphinx.js"></script><script type="text/javascript" src="../_static/js/abixTreeList-2.js"></script><script type="text/javascript" src="../_static/js/cdap-dynamicscrollspy-4.js"></script><script type="text/javascript" src="../_static/js/cdap-version-menu.js"></script><script type="text/javascript" src="../_static/js/copy-to-clipboard.js"></script><script type="text/javascript" src="../_static/js/jquery.mousewheel.min.js"></script><script type="text/javascript" src="../_static/js/jquery.mCustomScrollbar.js"></script><script type="text/javascript" src="../_static/js/js.cookie.js"></script><script type="text/javascript" src="../_static/js/tabbed-parsed-literal-0.2.js"></script><script type="text/javascript" src="../_static/js/cdap-onload-javascript.js"></script><script type="text/javascript" src="../_static/js/cdap-version-menu.js"></script>
    <script src="https://cdap.gitee.io/docs/cdap/json-versions.js"></script>
  </body>
</html>